{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Speech recognition"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Simple sppeech recognition system can be implemented using DTW + MFCC. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "import os\n",
    "import glob\n",
    "import librosa\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will use the [data-speech-commands database](https://storage.cloud.google.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz) composed of 105,000 WAVE audio files of people saying thirty different words. We will use only a subset of this database.\n",
    "\n",
    "We assume that you have previously downloaded and extracted the database. You need to specify the path to the folder where you extracted it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATABASE_PATH = '/Users/pierrerouanet/Downloads/data-speech_commands_v0.02'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'cat', 'dog', 'happy', 'house', 'zero'}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "labels = {'cat', 'dog', 'house', 'happy', 'zero'}\n",
    "labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We will use only N occurences per word\n",
    "N = 25"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Precompute all MFCCs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "mfccs = []\n",
    "true_labels = []\n",
    "\n",
    "for l in labels:\n",
    "    sounds = glob.glob(os.path.join(DATABASE_PATH, l, '*.wav'))\n",
    "    np.random.shuffle(sounds)\n",
    "    sounds = sounds[:N]\n",
    "    \n",
    "    for s in sounds:\n",
    "        y, sr = librosa.load(s)\n",
    "        mfcc = librosa.feature.mfcc(y, sr, n_mfcc=13)\n",
    "        mfccs.append(mfcc.T)\n",
    "        true_labels.append(l)\n",
    "        \n",
    "mfccs = np.array(mfccs)\n",
    "true_labels = np.array(true_labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare train/val dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "val_percent = 0.2\n",
    "n_val = int(val_percent * len(true_labels))\n",
    "\n",
    "I = np.random.permutation(len(true_labels))\n",
    "I_val, I_train = I[:n_val], I[n_val:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Leave P Out Cross Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dtw import dtw\n",
    "\n",
    "\n",
    "def cross_validation(train_indices, val_indices):\n",
    "    score = 0.0\n",
    "\n",
    "    for i in val_indices:\n",
    "        x = mfccs[i]\n",
    "\n",
    "        dmin, jmin = np.inf, -1\n",
    "        for j in train_indices:\n",
    "            y = mfccs[j]\n",
    "            d, _, _, _ = dtw(x, y, dist=lambda x, y: np.linalg.norm(x - y, ord=1))\n",
    "\n",
    "            if d < dmin:\n",
    "                dmin = d\n",
    "                jmin = j\n",
    "\n",
    "        score += 1.0 if (true_labels[i] == true_labels[jmin]) else 0.0\n",
    "        \n",
    "    return score / len(val_indices)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Recognition rate 52.0%\n"
     ]
    }
   ],
   "source": [
    "rec_rate = cross_validation(I_train, I_val)\n",
    "print('Recognition rate {}%'.format(100. * rec_rate))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
